#to make life easier
%load_ext autoreload
%autoreload 2
import autograd.numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
from src.models import Model, Prior, Conditional_model
from src.sampling import Metropolis_Hastings as MH
from src.helpers import samples_exploration
from src.optimization import gradient_descent as GD
df = pd.read_csv("data/wages.dat",
sep="\s+")
#normalization of continuous variables
df.LNWAGE = (df.LNWAGE - np.mean(df.LNWAGE.values))/np.std(df.LNWAGE.values)
df.ED =(df.ED - np.mean(df.ED.values))/np.std(df.ED.values)
df.EX =(df.EX - np.mean(df.EX.values))/np.std(df.EX.values)
#dropping correlated variables
df = df.drop(columns = ["EXSQ","AGE"], axis = 1)
#building predictors and response matrices
predictors = df[['ED', 'SOUTH', 'NONWH', 'HISP', 'FE', 'MARR', 'MARRFE', 'EX',
'UNION', 'MANUF', 'CONSTR', 'MANAG', 'SALES', 'CLER',
'SERV', 'PROF']]
y = df.LNWAGE.values
X = predictors.values
#splitting train and test
X, X_test, Y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_gaussian = Model.Model(Prior.Gaussian_prior,Conditional_model.Gaussian, Prior = [0,3,2], data = X, response = Y)
model_student = Model.Model(Prior.Student_prior, Conditional_model.Gaussian, Prior = [0,3,2,1/4], data = X, response = Y)
theta_gd_student = GD.vanilla_gd(model_student, max_iter= 1000)
theta_gd_gaussian = GD.vanilla_gd(model_gaussian, max_iter= 1000)
with warnings.catch_warnings(record=True):
initial = np.random.randn(model_gaussian.beta_size+1)
initial[0]=0.5
samples_gaussian = MH.random_walk_MH(model_gaussian, max_iter = 20000, verbose = True, step_size = 0.035, initial = initial)
initial[0]=20
samples_student = MH.random_walk_MH(model_student, max_iter = 20000, verbose = True, step_size = 0.035, initial = initial)
with warnings.catch_warnings(record=True):
print("--------------------------------- gaussian samples-----------------------------------")
samples_exploration(samples_gaussian[2500:], correlation= False)
with warnings.catch_warnings(record=True):
print("--------------------------------- student samples-----------------------------------")
samples_exploration(samples_student[2500:], correlation= False)
we use a burn in period of 2500
theta_MH_student = np.mean(samples_student[2500:],axis=0)
theta_MH_gaussian = np.mean(samples_gaussian[2500:],axis=0)
beta_OLS = np.linalg.inv(X.T@X)@X.T@Y
sigma_OLS = (Y-X@beta_OLS).T@(Y-X@beta_OLS)/(len(Y)-len(beta_OLS))
theta_OLS = np.insert(beta_OLS,0,sigma_OLS)
trace0 = go.Scatter(
y = theta_gd_student,
name = 'student, gd',
mode = 'markers',
marker = dict(
size = 10,
color = 'steelblue'
)
)
trace1 = go.Scatter(
y = theta_MH_student,
name = 'student, MH',
mode = 'markers',
marker = dict(
size = 10,
color = 'steelblue',
line = dict(
width = 2, color = "black")
)
)
trace2 = go.Scatter(
y = theta_gd_gaussian,
name = 'gaussian, gd',
mode = 'markers',
marker = dict(
size = 10,
color = 'red',
)
)
trace3 = go.Scatter(
y = theta_MH_gaussian,
name = 'gaussian, MH',
mode = 'markers',
marker = dict(
size = 10,
color = 'red',
line = dict(
width = 2, color = "black")
)
)
trace4 = go.Scatter(
y = theta_OLS,
name = 'OLS estimate',
mode = 'markers',
marker = dict(
size = 10,
color = 'orange'
)
)
data = [trace0, trace1, trace2, trace3, trace4]
layout = dict(title = 'Result comparison',
yaxis = dict(zeroline = False),
xaxis = dict(zeroline = False)
)
fig = dict(data=data, layout=layout)
py.iplot(fig)
inter = Y
classes = inter.copy()
classes[Y >= 1] = 1
classes[Y < 1 ]= 2
classes[Y < 0] = 3
classes[Y < -1] = 4
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
def convert(n):
return flatui[int(n)]
colors = list(map(convert, classes) )
plt.scatter(np.arange(0,X.shape[0]),Y, c = colors)
plt.show()
inter = y_test
classes = inter.copy()
classes[y_test >= 1] = 1
classes[y_test < 1 ]= 2
classes[y_test < 0] = 3
classes[y_test < -1] = 4
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
def convert(n):
return flatui[int(n)]
colors = list(map(convert, classes) )
plt.scatter(np.arange(0,X_test.shape[0]),y_test, c = colors)
plt.show()